package cn.cihon.spark.feature

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.ChiSqSelector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

/**
  * Created by eeexiu on 16-11-26.
  */
object ChiSqSelectorExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("ChiSqSelectorExample").setMaster("local[2]")
    val sc = new SparkContext(conf)

    /**
      *   输入数据格式：
      *
      *   0 127:60 128:96
      *
      *   第一位是分类，后面是标签：值，要求必须是数字
      */
    // Load some data in libsvm format
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    // Discretize data in 16 equal bins since ChiSqSelector requires categorical features
    // Even though features are doubles, the ChiSqSelector treats each unique value as a category
    val discretizedData = data.map { lp =>
      LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor }))
    }
    // Create ChiSqSelector that will select top 50 of 692 features
    val selector = new ChiSqSelector(50)
    // Create ChiSqSelector model (selecting features)
    val transformer = selector.fit(discretizedData)
    // Filter the top 50 features from each feature vector
    val filteredData = discretizedData.map { lp =>
      LabeledPoint(lp.label, transformer.transform(lp.features))
    }
    // $example off$

    println("filtered data: ")
    filteredData.foreach(x => println(x))

    sc.stop()
  }
}
